xxxxxxxxxximport pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snsxxxxxxxxxx#### as data is in the form of excel, use read_excel commandxxxxxxxxxxdf=pd.read_excel('C:\FormMyConputer\Data Analysis\Finance_Data_Analysis/Bank_Personal_Loan_Modelling.xlsx',1)df.head()xxxxxxxxxxdf.shapexxxxxxxxxxdf.isnull().sum()xxxxxxxxxx# ID and Zipcode might be removed as they may not be useful for our analysisdf.drop(['ID','ZIP Code'],axis=1,inplace=True)xxxxxxxxxxdf.columnsxxxxxxxxxximport plotly.express as pxxxxxxxxxxxfig=px.box(df,y=['Age', 'Experience', 'Income', 'Family', 'Education'])fig.show()xxxxxxxxxx#### Five point summary suggest that Experience has negative value(This should be fixed). we can see the Min, Max, mean and std deviation for all key attributes of the dataset Income has too much noise and slightly skewed right, Age and exp are equally distributed.we can see the Min, Max, mean and std deviation for all key attributes of the dataset
Income has too much noise and slightly skewed right, Age and exp are equally distributed.xxxxxxxxxx#### check if there is skewness in data or not!!xxxxxxxxxxdf.skew()xxxxxxxxxxdf.dtypesxxxxxxxxxxdf.hist(figsize=(20,20))xxxxxxxxxx#### INFERENCE from Histogram 1.Age & Experience are to an extent equally distributed 2.Income & Credit card spending are skewed to the left 3.We have more Undergraduates than Graduate and Advanced & Professional 4.60% of customers have enabled online banking and went digital1.Age & Experience are to an extent equally distributed
2.Income & Credit card spending are skewed to the left
3.We have more Undergraduates than Graduate and Advanced & Professional
4.60% of customers have enabled online banking and went digitalxxxxxxxxxximport seaborn as snsxxxxxxxxxxsns.distplot(df['Experience'])xxxxxxxxxxdf['Experience'].mean()xxxxxxxxxxNegative_exp=df[df['Experience']<0]Negative_exp.head()xxxxxxxxxxsns.distplot(Negative_exp['Age'])xxxxxxxxxxNegative_exp['Experience'].mean()xxxxxxxxxxNegative_exp.sizexxxxxxxxxxprint('There are {} records which has negative values for experience, approx {} %'.format(Negative_exp.size , ((Negative_exp.size/df.size)*100)))xxxxxxxxxxdata=df.copy()xxxxxxxxxxdata.head()xxxxxxxxxxxxxxxxxxxx#### use numpy where function to change the negative values to mean value derived from data with the same age groupxxxxxxxxxxdata['Experience']=np.where(data['Experience']<0,data['Experience'].mean(),data['Experience'])xxxxxxxxxxdata[data['Experience']<0]xxxxxxxxxxplt.figure(figsize=(10,6))sns.heatmap(df.corr(),annot=True)xxxxxxxxxx#### We could see that Age & Experience are very strongly correlated,#### Hence it is fine for us to go with Age and drop Experience to avoid multi-colinearity issue.xxxxxxxxxxdata=data.drop(['Experience'],axis=1)xxxxxxxxxxdata.head()xxxxxxxxxxxxxxxxxxxxdata['Education'].unique()xxxxxxxxxxdef mark(x): if x==1: return 'Undergrad' elif x==2: return 'Graduate' else: return 'Advanced/Professional'xxxxxxxxxxdata['Edu_mark']=data['Education'].apply(mark)xxxxxxxxxxdata.head()xxxxxxxxxxEDU_dis=data.groupby('Edu_mark')['Age'].count()xxxxxxxxxxEDU_disxxxxxxxxxxfig=px.pie(data,values=EDU_dis, names=EDU_dis.index,title='Pie CHart')fig.show()xxxxxxxxxx#### Inference :We could see that We have more Undergraduates 41.92% than graduates(28.06%) & Advanced Professional(30.02%)xxxxxxxxxxxxxxxxxxxxdata.columnsxxxxxxxxxx#### Lets Explore the account holder's distributionxxxxxxxxxxdef Security_CD(row): if (row['Securities Account']==1) & (row['CD Account']==1): return 'Holds Securites & Deposit' elif (row['Securities Account']==0) & (row['CD Account']==0): return 'Does not Holds Securites or Deposit' elif (row['Securities Account']==1) & (row['CD Account']==0): return ' Holds only Securites ' elif (row['Securities Account']==0) & (row['CD Account']==1): return ' Holds only Deposit' xxxxxxxxxxdata['Account_holder_category']=data.apply(Security_CD,axis=1)xxxxxxxxxxdata.head()xxxxxxxxxxvalues=data['Account_holder_category'].value_counts()values.indexxxxxxxxxxxxxxxxxxxxxfig=px.pie(data,values=values, names=values.index,title='Pie CHart')fig.show()xxxxxxxxxx#### We could see that alomst 87% of customers do not hold any securities or deposit, and 3 % hold both securities as well as deposit. It will be good if we encourage those 87% to open any of these account as it will improve the assests of the bankxxxxxxxxxxxxxxxxxxxxdata.columnsxxxxxxxxxxpx.box(data,x='Education',y='Income',facet_col='Personal Loan')xxxxxxxxxx#### Inference:From the above plot we could say that Income of customers who availed personal loan are alomst same irrescpective of their Educationxxxxxxxxxxxxxxxxxxxxplt.figure(figsize=(12,8))sns.distplot(data[data['Personal Loan']==0]['Income'],hist=False,label='Income with no personal loan')sns.distplot(data[data['Personal Loan']==1]['Income'],hist=False,label='Income with personal loan')plt.legend()xxxxxxxxxx#### Conclusion: Customers Who have availed personal loan seem to have higher income than those who do not have personal loanxxxxxxxxxxxxxxxxxxxxdef plot(col1,col2,label1,label2,title): plt.figure(figsize=(12,8)) sns.distplot(data[data[col2]==0][col1],hist=False,label=label1) sns.distplot(data[data[col2]==1][col1],hist=False,label=label2) plt.legend() plt.title(title)xxxxxxxxxxplot('Income','Personal Loan','Income with no personal loan','Income with personal loan','Income Distribution')xxxxxxxxxxxxxxxxxxxxplot('CCAvg','Personal Loan','Credit card avg with no personal loan','Credit card avg with personal loan','Credit card avg Distribution')xxxxxxxxxxplot('Mortgage','Personal Loan','Mortgage of customers with no personal loan','Mortgage of customers with personal loan','Mortgage of customers Distribution')xxxxxxxxxx#### People with high mortgage value, i.e more than 400K have availed personal Loanxxxxxxxxxxxxxxxxxxxxdata.columnsxxxxxxxxxxcol_names=['Securities Account','Online','Account_holder_category','CreditCard']xxxxxxxxxxfor i in col_names: plt.figure(figsize=(10,5)) sns.countplot(x=i,hue='Personal Loan',data=data)xxxxxxxxxx#### From the above graph we could infer that , customers who hold deposit account & customers who do not hold either a securities account or deposit account have aviled personal loanxxxxxxxxxxxxxxxxxxxx### Q.. How Age of a person is going to be a factor in availing loan ??? Does Income of a person have an impact on availing loan ??? Does the family size makes them to avail loan ???¶xxxxxxxxxxsns.scatterplot(data['Age'],data['Personal Loan'],hue=data['Family'])xxxxxxxxxximport scipy.stats as statsxxxxxxxxxxHo='Age does not have impact on availing personal loan'Ha='Age does have impact on availing personal loan'xxxxxxxxxxAge_no=np.array(data[data['Personal Loan']==0]['Age'])Age_yes=np.array(data[data['Personal Loan']==1]['Age'])xxxxxxxxxxt,p_value=stats.ttest_ind(Age_no,Age_yes,axis=0)if p_value<0.05: print(Ha,' as the p_value is less than 0.05 with a value of {}'.format(p_value))else: print(Ho,' as the p_value is greater than 0.05 with a value of {}'.format(p_value))xxxxxxxxxxxxxxxxxxxxdef Hypothesis(col1,col2,HO,Ha): arr1=np.array(data[data[col1]==0][col2]) arr2=np.array(data[data[col1]==1][col2]) t,p_value=stats.ttest_ind(arr1,arr2,axis=0) if p_value<0.05: print('{}, as the p_value is less than 0.05 with a value of {}'.format(Ha,p_value)) else: print('{} as the p_value is greater than 0.05 with a value of {}'.format(HO,p_value))xxxxxxxxxxHypothesis('Personal Loan','Age',HO='Age does not have impact on availing personal loan',Ha='Age does have impact on availing personal loan')xxxxxxxxxxxxxxxxxxxx### Q..Income of a person has significant impact on availing Personal Loan or not?xxxxxxxxxxHypothesis(col1='Personal Loan',col2='Income',HO='Income does not have impact on availing personal loan',Ha='Income does have impact on availing personal loan')xxxxxxxxxx#### Income have phenomenal significance on availing personal Loan , As the P_value is less than 0.05 with a value of :0.0xxxxxxxxxxxxxxxxxxxx### Q..Number of persons in the family has significant impact on availing Personal Loan or not?xxxxxxxxxxHypothesis('Personal Loan','Family',HO='AgFamily does not have impact on availing personal loan',Ha='Family does have impact on availing personal loan')xxxxxxxxxx#### Family have phenomenal significance on availing personal Loan , As the P_value is less than 0.05 with a value of :1.4099040685673807e-05xxxxxxxxxx